In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
In [2]:
data = pd.read_csv("scholarship.csv")
In [3]:
data.head()
Out[3]:
semester_percentage scholarship_exam_marks got_scholarship
0 71.9 26 1
1 74.6 38 1
2 75.4 40 1
3 64.2 8 1
4 72.3 17 0
In [4]:
sns.distplot(data["scholarship_exam_marks"])
C:\Users\Satyam\AppData\Local\Temp\ipykernel_3132\828206924.py:1: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data["scholarship_exam_marks"])
Out[4]:
<Axes: xlabel='scholarship_exam_marks', ylabel='Density'>
No description has been provided for this image
In [5]:
data["scholarship_exam_marks"].skew()
Out[5]:
0.8356419499466834
In [14]:
data["scholarship_exam_marks"].describe()
Out[14]:
count    1000.000000
mean       32.225000
std        19.130822
min         0.000000
25%        17.000000
50%        28.000000
75%        44.000000
max       100.000000
Name: scholarship_exam_marks, dtype: float64
In [7]:
sns.boxplot(data["scholarship_exam_marks"])
Out[7]:
<Axes: ylabel='scholarship_exam_marks'>
No description has been provided for this image
In [15]:
Q1 = data["scholarship_exam_marks"].quantile(0.25)
Q3 = data["scholarship_exam_marks"].quantile(0.75)
In [16]:
IQR = Q3 - Q1
In [17]:
min = Q1 - 1.5*IQR
max = Q3 + 1.5*IQR
In [18]:
#trimming
new_data = data[(data["scholarship_exam_marks"]>min) & (data["scholarship_exam_marks"]<max)]
In [20]:
plt.figure(figsize=(16,8))
plt.subplot(2,2,1)
sns.distplot(data["scholarship_exam_marks"])

plt.subplot(2,2,2)
sns.boxplot(data["scholarship_exam_marks"])

plt.subplot(2,2,3)
sns.distplot(new_data["scholarship_exam_marks"])

plt.subplot(2,2,4)
sns.boxplot(new_data["scholarship_exam_marks"])

plt.show()
C:\Users\Satyam\AppData\Local\Temp\ipykernel_3132\2007280676.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(data["scholarship_exam_marks"])
C:\Users\Satyam\AppData\Local\Temp\ipykernel_3132\2007280676.py:9: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(new_data["scholarship_exam_marks"])
No description has been provided for this image
In [ ]:
#capping
new_data2 = data.copy()
new_data2["scholarship_exam_marks"] = np.where(data["scholarship_exam_marks"]>max,
        max, 
        np.where(data["scholarship_exam_marks"]<min,min, data["scholarship_exam_marks"]))
In [ ]:
new_data2.shape
Out[ ]:
(1000, 3)
In [ ]:
new_data2["scholarship_exam_marks"].describe()
Out[ ]:
count    1000.000000
mean       32.136500
std        18.865419
min         0.000000
25%        17.000000
50%        28.000000
75%        44.000000
max        84.500000
Name: scholarship_exam_marks, dtype: float64